OHI-Northeast | OHI Science | Citation policy
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
library(crul)
library(rvest)
library(xml2)
source("~/github/ne-prep/src/R/common.R")“Landings data do not indicate the physical location of harvest but the location at which the landings either first crossed the dock or were reported from.”
conn <- HttpClient$new(
url = "https://www.st.nmfs.noaa.gov/pls/webpls/MF_ANNUAL_LANDINGS.RESULTS"
)res <- conn$post(body = list(
qspecies = "ALL SPECIES INDIVIDUALLY",
qreturn = "search",
qyearfrom = 1980, #it looks like this is the minimum available year for this region.
qyearto = 2016,
qstate = "New England By State",
qoutput_type = "TABLE"
))
x <- res$parse("UTF-8")
rvest::html_table(xml2::read_html(x), fill = TRUE)
#assign object and save as dataframe
ne_data <- rvest::html_table(xml2::read_html(x), fill = TRUE) %>%
as.data.frame()res <- conn$post(body = list(
qspecies = "ALL SPECIES INDIVIDUALLY",
qreturn = "search",
qyearfrom = 1980, #it looks like this is the minimum available year for this region. I originally used 1900 as qyearfrom and 1945 was min returned.
qyearto = 2016,
qstate = "New York",
qoutput_type = "TABLE"
))
x <- res$parse("UTF-8")
ny_data <- rvest::html_table(xml2::read_html(x), fill = TRUE) %>%
as.data.frame() %>%
mutate(State = "New York")The column “X” is actually dollars. We can remove this column.
noaa_data <- ne_data %>%
rbind(ny_data) %>%
select(-X.) %>%
rename(tons = Metric.Tons, pounds = Pounds, state = State, year = Year, species = Species) %>%
filter(!is.na(state),
!is.na(species),
state != "", #these rows are subtotals which we can calculate on our own
state != "-") #this row is "grand total" which we can calculate on our ownMost of the species are written like “TUNA, BLUEFIN”. But I would like this to be “Bluefin Tuna”. This isn’t super easy since there are other values like “SHRIMP, MARINE, OTHER” that don’t lend themselves to a simple separate and paste.
nd <- noaa_data %>%
mutate(common_name = species) %>%
tidyr::separate(common_name, into = c("one", "two", "three"), sep = ", ") %>%
mutate(common = paste0(two, " ", one)) %>%
mutate(final_name = case_when(
is.na(two) ~ one,
is.na(three) ~ common,
!is.na(three) ~ common
)) %>%
select(-one, -two, -three, -common, -species) %>%
rename(species = final_name) %>%
mutate(pounds = as.numeric(gsub(",", "", pounds)),
year = as.numeric(year))
write_csv(nd, "data/noaa_catch_statistics.csv")#read in raw data
nd <- read_csv("data/noaa_catch_statistics.csv") %>%
filter(year > 2002) %>%
mutate(pounds_in_thou = pounds/1000) #column in thousands of pounds for better plotting
p <- ggplot(nd, aes(x = year, y = pounds_in_thou, color = species)) +
geom_line() +
theme_bw() +
facet_wrap(~state, scales = "free") +
theme(legend.position = "none")
plotly::ggplotly(p)I need to add in the regions, which will be all the state waters. Massachusetts will remain as “Massachusetts” and the final goal score for Wild-Caught Fisheries will be the same for both MA regions. I’m also changing all species to lower case to match with RAM later. Even though the NOAA data only goes to 2016, we add the year 2017 for the toolbox. We also need to gapfill missing data. When a species/state combination has missing data for a year, we are going to assume that there was zero catch for that species/year. We also calculate a rolling average of catch. This is done to account for any wild fluctuations in catch year to year.
tbx_lyr <- nd %>%
mutate(species = tolower(species)) %>%
group_by(state, species) %>%
complete(year = 1980:2017) %>%
ungroup() %>%
mutate(catch = ifelse(is.na(pounds), 0, pounds)) %>%
group_by(state, species) %>%
arrange(year) %>%
mutate(mean_catch = zoo::rollapply(catch, 3, mean, fill = NA, align = 'right')) %>% ## create a new column `mean_catch` with rolling mean of 3 yrs
ungroup() %>%
left_join(rgn_data) %>%
filter(year > 2004) %>%
select(year, state, rgn_id, species, mean_catch)
# save to toolbox
write.csv(tbx_lyr, file = file.path(dir_calc, "layers/fis_meancatch.csv"))Plot mean catch after gapfilling. This is the data used in scoring.
p <- ggplot(tbx_lyr, aes(x = year, y = mean_catch/1000, color = species)) +
geom_line() +
theme_bw() +
facet_wrap(~state, scales = "free") +
labs(y = "Mean catch (1000s lbs)",
x = "") +
theme(legend.position = "none")
plotly::ggplotly(p)